diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c
--- OpenJPEG.orig/libopenjpeg/t1.c	2007-08-23 05:53:17.000000000 -0500
+++ OpenJPEG.patched/libopenjpeg/t1.c	2007-08-23 05:56:33.000000000 -0500
@@ -45,7 +45,11 @@
 static char t1_getspb(int f);
 static short t1_getnmsedec_sig(int x, int bitpos);
 static short t1_getnmsedec_ref(int x, int bitpos);
+#ifdef __amd64__ /* amd64 uses the 64-bit SWAR definition further down */
+static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride);
+#else /* !__amd64__ */
 static void t1_updateflags(flag_t *flagsp, int s, int stride);
+#endif /* __amd64__ */
 /**
 Encode significant pass
 */
@@ -258,6 +262,38 @@
     return lut_nmsedec_ref0[x & ((1 << T1_NMSEDEC_BITS) - 1)];
 }
 
+#ifdef __amd64__
+
+/* On amd64 three 16-bit flags are updated per 64-bit operation (SWAR). */
+/* The lane order relies on little-endian storage, which amd64 guarantees. */
+
+#define VEC(x,y,z) ((int64)(x)|((int64)(y)<<16)|((int64)(z)<<32)) /* x is the lowest 16-bit lane */
+
+/* OR mod[] into the three flags centred on flagsp in each of the three lines
+   flagsp-stride, flagsp, flagsp+stride. s must be 0 or 1 (1 adds T1_SGN_*). */
+static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride) {
+	static const int64 mod[] = {
+		VEC(T1_SIG_SE,         T1_SIG_E,          T1_SIG_NE),
+		VEC(T1_SIG_SE,         T1_SIG_E|T1_SGN_E, T1_SIG_NE),
+		VEC(T1_SIG_S,          T1_SIG,            T1_SIG_N),
+		VEC(T1_SIG_S|T1_SGN_S, T1_SIG,            T1_SIG_N|T1_SGN_N),
+		VEC(T1_SIG_SW,         T1_SIG_W,          T1_SIG_NW),
+		VEC(T1_SIG_SW,         T1_SIG_W|T1_SGN_W, T1_SIG_NW)
+	};
+	int64 lanes;
+	int line;
+	/* memcpy instead of an int64* cast: no aliasing or alignment UB. The 4th
+	   16-bit lane of each access is read and written back unchanged. */
+	for (line = -1; line <= 1; ++line) {
+		flag_t *run = flagsp - 1 + line * stride;
+		memcpy(&lanes, run, sizeof lanes);
+		lanes |= mod[s + 2 * (line + 1)];
+		memcpy(run, &lanes, sizeof lanes);
+	}
+}
+
+#else
+
 static void t1_updateflags(flag_t *flagsp, int s, int stride) {
 	static const flag_t mod[] = {
 		T1_SIG_E, T1_SIG_E|T1_SGN_E,
@@ -279,6 +315,8 @@
 	flagsp[ 1 + stride] |= T1_SIG_NW;
 }
 
+#endif
+
 static void t1_enc_sigpass_step(
 		opj_t1_t *t1,
 		flag_t *flagsp,
@@ -670,6 +708,8 @@
 		for (i = 0; i < t1->w; ++i) {
 			if (k + 3 < t1->h) {
 #ifdef __amd64__
+				/* 64-bit SWAR: one load covers four consecutive 16-bit flags. */
+				/* Lane order relies on little-endian storage (true on amd64). */
 				int64 tmp = *((int64*)&t1->flags[(k+1) + (i+1)*(t1->h+2)]);
 				if (cblksty & J2K_CCP_CBLKSTY_VSC) {
 					tmp &= ~((int64)(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S)<<48);
@@ -780,6 +820,11 @@
 	memset(t1->data,0,datasize * sizeof(int));
 
 	flagssize=(h+2) * (w+2);
+#ifdef __amd64__
+	/* Each 64-bit SWAR access in t1_updateflags spans four shorts (only three
+	   are modified), so keep one extra short of headroom past the end. */
+	++flagssize;
+#endif
 
 	if(flagssize > t1->flagssize){
 		opj_aligned_free(t1->flags);
